In [ ]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from wordcloud import WordCloud
import string
In [ ]:
import os
import random
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from typing import Union, List
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [ ]:
import nltk

# Fetch the NLTK resources the preprocessing pipeline depends on
# (tokenizer, POS tagger, lemmatizer corpus, stopword list). Each call
# is a no-op when the package is already present.
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(nltk_package)
[nltk_data] Downloading package punkt to /usr/share/nltk_data... [nltk_data] Package punkt is already up-to-date! [nltk_data] Downloading package averaged_perceptron_tagger to [nltk_data] /usr/share/nltk_data... [nltk_data] Package averaged_perceptron_tagger is already up-to- [nltk_data] date! [nltk_data] Downloading package wordnet to /usr/share/nltk_data... [nltk_data] Package wordnet is already up-to-date! [nltk_data] Downloading package stopwords to /usr/share/nltk_data... [nltk_data] Package stopwords is already up-to-date!
Out[ ]:
True
In [ ]:
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /usr/share/nltk_data... [nltk_data] Package wordnet is already up-to-date!
Out[ ]:
True
In [ ]:
tweet_data = pd.read_csv('cyberbullying_tweets.csv')
In [ ]:
tweet_data.head()
Out[ ]:
| | tweet_text | cyberbullying_type |
|---|---|---|
| 0 | In other words #katandandre, your food was cra... | not_cyberbullying |
| 1 | Why is #aussietv so white? #MKR #theblock #ImA... | not_cyberbullying |
| 2 | @XochitlSuckkks a classy whore? Or more red ve... | not_cyberbullying |
| 3 | @Jason_Gio meh. :P thanks for the heads up, b... | not_cyberbullying |
| 4 | @RudhoeEnglish This is an ISIS account pretend... | not_cyberbullying |
In [ ]:
tweet_data['cyberbullying_type'].value_counts()
Out[ ]:
cyberbullying_type religion 7998 age 7992 gender 7973 ethnicity 7961 not_cyberbullying 7945 other_cyberbullying 7823 Name: count, dtype: int64
In [ ]:
def extract_emojis(text):
    """Return a list of the emoji characters found in *text*.

    Matches the emoticon, symbols/pictograph, transport, and
    regional-indicator (flag) Unicode ranges; emoji outside those
    blocks are not detected.
    """
    emoji_pattern = re.compile(
        r'[\U0001F600-\U0001F64F'   # emoticons
        r'\U0001F300-\U0001F5FF'    # symbols & pictographs
        r'\U0001F680-\U0001F6FF'    # transport & map symbols
        r'\U0001F1E0-\U0001F1FF]'   # regional indicators (flags)
    )
    return emoji_pattern.findall(text)
In [ ]:
tweet_data['emojis'] = tweet_data['tweet_text'].apply(extract_emojis)
In [ ]:
tweet_data['emojis'][21]
Out[ ]:
['😘']
In [ ]:
# Lazily-built singletons shared across calls: the original rebuilt the
# stopword set and the WordNetLemmatizer on EVERY call, i.e. once per
# DataFrame row. Building them once is a significant saving over ~48k tweets.
_PREPROCESS_RESOURCES = {}


def preprocess_text(text):
    """Clean one tweet for modeling.

    Steps: strip http(s) URLs, replace every character other than letters,
    '#' and '@' with a space, lowercase, tokenize, drop English stopwords and
    bare punctuation tokens, then lemmatize each word using its POS tag
    (noun/verb/adjective; everything else defaults to noun).

    Returns the cleaned tweet as a single space-joined string.
    """
    res = _PREPROCESS_RESOURCES
    if not res:
        res['stop_words'] = set(stopwords.words('english'))
        res['punct'] = set(string.punctuation)
        res['lemmatizer'] = WordNetLemmatizer()

    # Remove URLs (http/https only; bare "www." links are left in).
    text = re.sub(r'http\S+', '', text)
    # Keep letters plus '#' and '@' so hashtags/mentions survive tokenization.
    text = re.sub("[^a-zA-Z#@]", " ", text)
    text = text.lower()

    words = word_tokenize(text)
    words = [w for w in words if w not in res['stop_words']]
    # After the regex pass, only '#'/'@' can survive as punctuation tokens.
    words = [w for w in words if w not in res['punct']]

    # POS-aware lemmatization: map Penn Treebank tag prefixes to WordNet codes
    # ('NN*' -> noun, 'VB*' -> verb, 'JJ*' -> adjective, default noun).
    tag_to_pos = {'NN': 'n', 'VB': 'v', 'JJ': 'a'}
    lemmatizer = res['lemmatizer']
    lemmatized_words = [
        lemmatizer.lemmatize(word, tag_to_pos.get(tag[:2], 'n'))
        for word, tag in pos_tag(words)
    ]
    return ' '.join(lemmatized_words)
In [ ]:
#!pip install --upgrade nltk
In [ ]:
nltk.download('wordnet')
[nltk_data] Downloading package wordnet to /usr/share/nltk_data... [nltk_data] Package wordnet is already up-to-date!
Out[ ]:
True
In [ ]:
tweet_data['clean_text']= tweet_data['tweet_text'].apply(preprocess_text)
In [ ]:
tweet_data
Out[ ]:
| | tweet_text | cyberbullying_type | emojis | clean_text |
|---|---|---|---|---|
| 0 | In other words #katandandre, your food was cra... | not_cyberbullying | [] | word katandandre food crapilicious mkr |
| 1 | Why is #aussietv so white? #MKR #theblock #ImA... | not_cyberbullying | [] | aussietv white mkr theblock imacelebrityau tod... |
| 2 | @XochitlSuckkks a classy whore? Or more red ve... | not_cyberbullying | [] | xochitlsuckkks classy whore red velvet cupcake |
| 3 | @Jason_Gio meh. :P thanks for the heads up, b... | not_cyberbullying | [] | jason gio meh p thanks head concern another an... |
| 4 | @RudhoeEnglish This is an ISIS account pretend... | not_cyberbullying | [] | rudhoeenglish isi account pretend kurdish acco... |
| ... | ... | ... | ... | ... |
| 47687 | Black ppl aren't expected to do anything, depe... | ethnicity | [] | black ppl expect anything depend anything yet ... |
| 47688 | Turner did not withhold his disappointment. Tu... | ethnicity | [] | turner withhold disappointment turner call cou... |
| 47689 | I swear to God. This dumb nigger bitch. I have... | ethnicity | [] | swear god dumb nigger bitch get bleach hair re... |
| 47690 | Yea fuck you RT @therealexel: IF YOURE A NIGGE... | ethnicity | [] | yea fuck rt therealexel youre nigger fuck unfo... |
| 47691 | Bro. U gotta chill RT @CHILLShrammy: Dog FUCK ... | ethnicity | [] | bro u get ta chill rt chillshrammy dog fuck kp... |
47692 rows × 4 columns
In [ ]:
# import emoji
In [ ]:
#!pip install demoji
Collecting demoji Downloading demoji-1.1.0-py3-none-any.whl.metadata (9.2 kB) Downloading demoji-1.1.0-py3-none-any.whl (42 kB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.9/42.9 kB 622.2 kB/s eta 0:00:000:00:01 Installing collected packages: demoji Successfully installed demoji-1.1.0
In [ ]:
import demoji
demoji.download_codes()
C:\Users\hp\AppData\Local\Temp\ipykernel_6064\1306366529.py:3: FutureWarning: The demoji.download_codes attribute is deprecated and will be removed from demoji in a future version. It is an unused attribute as emoji codes are now distributed directly with the demoji package. demoji.download_codes()
In [ ]:
def _describe_emojis(emoji_list):
    # demoji.findall maps each emoji char to its text description;
    # join all descriptions into one space-separated string.
    if not emoji_list:
        return ''
    return ' '.join(demoji.findall(' '.join(emoji_list)).values())


tweet_data['emojis_description'] = tweet_data['emojis'].apply(_describe_emojis)
print(tweet_data['emojis_description'])
0
1
2
3
4
..
47687
47688
47689
47690
47691
Name: emojis_description, Length: 47692, dtype: object
In [ ]:
tweet_data['emojis_description']= tweet_data['emojis_description'].apply(preprocess_text)
In [ ]:
tweet_data['combined_text'] = tweet_data['clean_text'] + ' ' + tweet_data['emojis_description']
In [ ]:
In [ ]:
tweet_data['combined_text'][21]
Out[ ]:
'kid love mohamad bin zayed city face blow kiss'
In [ ]:
from transformers import BertTokenizer, BertModel
import torch
In [ ]:
# Concatenate the raw tweets of each class into one document per class.
# (The text_label_* names are kept so any later cell referencing them works.)
text_label_0 = ' '.join(tweet_data[tweet_data['cyberbullying_type'] == 'religion']['tweet_text'].values)
text_label_1 = ' '.join(tweet_data[tweet_data['cyberbullying_type'] == 'age']['tweet_text'].values)
text_label_2 = ' '.join(tweet_data[tweet_data['cyberbullying_type'] == 'gender']['tweet_text'].values)
text_label_3 = ' '.join(tweet_data[tweet_data['cyberbullying_type'] == 'ethnicity']['tweet_text'].values)
text_label_4 = ' '.join(tweet_data[tweet_data['cyberbullying_type'] == 'not_cyberbullying']['tweet_text'].values)
text_label_5 = ' '.join(tweet_data[tweet_data['cyberbullying_type'] == 'other_cyberbullying']['tweet_text'].values)

# The original plotted each cloud with six copy-pasted stanzas differing only
# in the text and title; one loop produces the same figures in the same order.
_cloud_specs = [
    (text_label_0, 'Word Cloud for Religion'),
    (text_label_1, 'Word Cloud for Age'),
    (text_label_2, 'Word Cloud for Gender'),
    (text_label_3, 'Word Cloud for Ethnicity'),
    (text_label_4, 'Word Cloud for Not Cyberbullying'),
    (text_label_5, 'Word Cloud for Other_cyberbullying'),
]
for class_text, cloud_title in _cloud_specs:
    cloud = WordCloud(width=800, height=800, max_words=1000,
                      background_color='white').generate(class_text)
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(cloud_title)
    plt.tight_layout(pad=0)
    plt.show()
In [ ]:
print(f'There are around {int(tweet_data["combined_text"].duplicated().sum())} duplicated tweets, we will remove them.')
There are around 2367 duplicated tweets, we will remove them.
In [ ]:
tweet_data.drop_duplicates("combined_text", inplace=True)
In [ ]:
tweet_data['cyberbullying_type'].value_counts()
Out[ ]:
cyberbullying_type religion 7955 age 7886 ethnicity 7865 not_cyberbullying 7826 gender 7704 other_cyberbullying 6089 Name: count, dtype: int64
In [ ]:
tweet_data['text_len'] = [len(text.split()) for text in tweet_data.tweet_text]
In [ ]:
tweet_data.sort_values(by=['text_len'], ascending=False)
Out[ ]:
| | tweet_text | cyberbullying_type | emojis | clean_text | emojis_description | combined_text | text_len |
|---|---|---|---|---|---|---|---|
| 29205 | is feminazi an actual word with a denot…\r\n@N... | other_cyberbullying | [🙉] | feminazi actual word denot nasharchy job mean ... | hear evil monkey | feminazi actual word denot nasharchy job mean ... | 790 |
| 24516 | @NICKIMINAJ: #WutKinda\r\nAt this rate the MKR... | other_cyberbullying | [] | nickiminaj wutkinda rate mkr final december mk... | nickiminaj wutkinda rate mkr final december mk... | 692 | |
| 30752 | I don't retreat.\r\nyessssssss http://t.co/Td9... | other_cyberbullying | [] | retreat yes uh make grownups boruto look lit b... | retreat yes uh make grownups boruto look lit b... | 505 | |
| 44035 | You so black and white trying to live like a n... | ethnicity | [] | black white try live like nigger pahahahaha co... | black white try live like nigger pahahahaha co... | 329 | |
| 1317 | @EurekAlertAAAS: Researchers push to import to... | not_cyberbullying | [💩, 👍, 💔, 👎] | eurekalertaaas researcher push import top anti... | pile poo thumb broken heart thumb | eurekalertaaas researcher push import top anti... | 289 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 334 | FFS | not_cyberbullying | [] | ffs | ffs | 1 | |
| 8457 | #IndiasDaughter | gender | [] | indiasdaughter | indiasdaughter | 1 | |
| 8165 | good! | gender | [] | good | good | 1 | |
| 307 | :D | not_cyberbullying | [] | 1 | |||
| 314 | Preseason | not_cyberbullying | [] | preseason | preseason | 1 |
45325 rows × 7 columns
In [ ]:
# Distribution of tweet lengths (in words), restricted to tweets of <= 1000
# words so the axis stays readable.
plt.figure(figsize=(20, 5))
length_ax = sns.countplot(
    x='text_len',
    data=tweet_data[tweet_data['text_len'] <= 1000],
    palette='Blues_r',
)
plt.title('Count of tweets vs length ', fontsize=20)
plt.yticks([])
length_ax.bar_label(length_ax.containers[0])
plt.ylabel('count')
plt.xlabel('')
plt.show()
In [ ]:
tweet_data = tweet_data[tweet_data['text_len'] < tweet_data['text_len'].quantile(0.995)]
In [ ]:
max_len = np.max(tweet_data['text_len'])
max_len
Out[ ]:
56
In [ ]:
MAX_LEN =100
In [ ]:
from keras.preprocessing.sequence import pad_sequences
2024-04-18 10:33:06.328212: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered 2024-04-18 10:33:06.328346: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered 2024-04-18 10:33:06.499978: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
In [ ]:
label_encoder = LabelEncoder()
tweet_data['cyberbullying_type_label'] = label_encoder.fit_transform(tweet_data['cyberbullying_type'])
print(dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))
{'age': 0, 'ethnicity': 1, 'gender': 2, 'not_cyberbullying': 3, 'other_cyberbullying': 4, 'religion': 5}
/tmp/ipykernel_76/246362075.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy tweet_data['cyberbullying_type_label'] = label_encoder.fit_transform(tweet_data['cyberbullying_type'])
In [ ]:
X = tweet_data['combined_text'].values
y = tweet_data['cyberbullying_type_label'].values
In [ ]:
class TextClassificationDataset(Dataset):
    """Torch dataset pairing raw text strings with integer labels.

    Each item is tokenized on access and returned as fixed-length
    (padded/truncated) input-id and attention-mask tensors plus the label.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # Tokenizer output is (1, max_length); flatten to 1-D per sample so
        # the DataLoader can batch items into (batch, max_length).
        sample = {
            'input_ids': encoded['input_ids'].flatten(),
            'attention_mask': encoded['attention_mask'].flatten(),
            'label': torch.tensor(self.labels[idx]),
        }
        return sample
In [ ]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder with a dropout + linear classification head."""

    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        # Encoder loaded from the given HF checkpoint name.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        # Head maps the pooled sentence representation to class logits.
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        bert_out = self.bert(input_ids, attention_mask=attention_mask)
        dropped = self.dropout(bert_out.pooler_output)
        return self.fc(dropped)
In [ ]:
from tqdm import tqdm


def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean per-batch loss.

    Steps the optimizer AND the LR scheduler after every batch, matching
    the get_linear_schedule_with_warmup setup used elsewhere in this file.
    """
    model.train()
    # Fix: the original constructed a fresh nn.CrossEntropyLoss() inside the
    # loop on every batch; build the criterion once per epoch instead.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0
    with tqdm(total=len(data_loader), desc="Training") as progress_bar:
        for batch in data_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
            progress_bar.update(1)
    return total_loss / len(data_loader)
In [ ]:
def evaluate(model, data_loader, device):
    """Score the model on *data_loader*.

    Returns a tuple of (accuracy, sklearn classification report string)
    computed over every batch, with gradients disabled.
    """
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the max logit per row.
            _, batch_preds = torch.max(logits, dim=1)
            all_preds.extend(batch_preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
    report = classification_report(y_true=all_labels, y_pred=all_preds)
    return accuracy, report
In [ ]:
def predict_class(text, model, tokenizer, classes, max_length=128):
    """Classify a single string and return the matching name from *classes*.

    *classes* must be indexable by the model's integer label (e.g. the
    label_encoder.classes_ array).
    """
    model.eval()
    encoded = tokenizer(text, return_tensors='pt', max_length=max_length,
                        padding='max_length', truncation=True)
    with torch.no_grad():
        logits = model(input_ids=encoded['input_ids'],
                       attention_mask=encoded['attention_mask'])
        _, top_idx = torch.max(logits, dim=1)
    return classes[top_idx.item()]
In [ ]:
# Training configuration for the BERT fine-tune.
bert_model_name = 'bert-base-uncased'   # HF checkpoint for the encoder
num_classes = 6                         # six cyberbullying_type labels
max_length = 128                        # token sequence length fed to BERT
batch_size = 32
num_epochs = 5
learning_rate = 2e-5
In [ ]:
train_texts, val_texts, train_labels, val_labels = train_test_split(X, y, test_size=0.2, random_state=42)
len(train_texts), len(val_texts), len(train_labels), len(val_labels)
Out[ ]:
(36014, 9004, 36014, 9004)